You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the Jupyter Notebook FAQ course resource.


Distributions in Pandas


In [1]:
import pandas as pd
import numpy as np

In [13]:
for i in range(5):
    coinflip = np.random.binomial(1, 0.5)
    print(coinflip)


1
0
1
1
0

In [18]:
np.random.binomial(1000, 0.5)/1000


Out[18]:
0.503

In [14]:
chance_of_tornado = 0.01/100
np.random.binomial(100000, chance_of_tornado)


Out[14]:
13

In [30]:
chance_of_tornado = 0.01

tornado_events = np.random.binomial(1, chance_of_tornado, 1000000)
    
two_days_in_a_row = 0
for j in range(1,len(tornado_events)-1):
    if tornado_events[j]==1 and tornado_events[j-1]==1:
        two_days_in_a_row+=1

print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, 1000000/365))


101 tornadoes back to back in 2739.72602739726 years

In [35]:
np.random.uniform(0, 1)


Out[35]:
0.8477519368060076

In [38]:
np.random.normal(0.75)


Out[38]:
0.7418276998824014

Formula for standard deviation $$\sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \overline{x})^2}$$


In [39]:
distribution = np.random.normal(0.75,size=1000)

np.sqrt(np.sum((np.mean(distribution)-distribution)**2)/len(distribution))


Out[39]:
0.96843810415836251

In [40]:
np.std(distribution)


Out[40]:
0.96843810415836251

In [41]:
import scipy.stats as stats
stats.kurtosis(distribution)


Out[41]:
0.03293898548813612

In [42]:
stats.skew(distribution)


Out[42]:
-0.1586539371740313

In [63]:
chi_squared_df2 = np.random.chisquare(10, size=10000)
stats.skew(chi_squared_df2)


Out[63]:
0.9433800012480495

In [44]:
chi_squared_df5 = np.random.chisquare(5, size=10000)
stats.skew(chi_squared_df5)


Out[44]:
1.3149434950712884

In [64]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

output = plt.hist([chi_squared_df2,chi_squared_df5], bins=200, histtype='step', 
                  label=['2 degrees of freedom','5 degrees of freedom'])
plt.legend(loc='upper right')


Out[64]:
<matplotlib.legend.Legend at 0x7fd217b186a0>

Hypothesis Testing


In [65]:
df = pd.read_csv('grades.csv')

In [66]:
df.head()


Out[66]:
student_id assignment1_grade assignment1_submission assignment2_grade assignment2_submission assignment3_grade assignment3_submission assignment4_grade assignment4_submission assignment5_grade assignment5_submission assignment6_grade assignment6_submission
0 B73F2C11-70F0-E37D-8B10-1D20AFED50B1 92.733946 2015-11-02 06:55:34.282000000 83.030552 2015-11-09 02:22:58.938000000 67.164441 2015-11-12 08:58:33.998000000 53.011553 2015-11-16 01:21:24.663000000 47.710398 2015-11-20 13:24:59.692000000 38.168318 2015-11-22 18:31:15.934000000
1 98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1 86.790821 2015-11-29 14:57:44.429000000 86.290821 2015-12-06 17:41:18.449000000 69.772657 2015-12-10 08:54:55.904000000 55.098125 2015-12-13 17:32:30.941000000 49.588313 2015-12-19 23:26:39.285000000 44.629482 2015-12-21 17:07:24.275000000
2 D0F62040-CEB0-904C-F563-2F8620916C4E 85.512541 2016-01-09 05:36:02.389000000 85.512541 2016-01-09 06:39:44.416000000 68.410033 2016-01-15 20:22:45.882000000 54.728026 2016-01-11 12:41:50.749000000 49.255224 2016-01-11 17:31:12.489000000 44.329701 2016-01-17 16:24:42.765000000
3 FFDF2B2C-F514-EF7F-6538-A6A53518E9DC 86.030665 2016-04-30 06:50:39.801000000 68.824532 2016-04-30 17:20:38.727000000 61.942079 2016-05-12 07:47:16.326000000 49.553663 2016-05-07 16:09:20.485000000 49.553663 2016-05-24 12:51:18.016000000 44.598297 2016-05-26 08:09:12.058000000
4 5ECBEEB6-F1CE-80AE-3164-E45E99473FB4 64.813800 2015-12-13 17:06:10.750000000 51.491040 2015-12-14 12:25:12.056000000 41.932832 2015-12-29 14:25:22.594000000 36.929549 2015-12-28 01:29:55.901000000 33.236594 2015-12-29 14:46:06.628000000 33.236594 2016-01-05 01:06:59.546000000

In [67]:
len(df)


Out[67]:
2315

In [68]:
early = df[df['assignment1_submission'] <= '2015-12-31']
late = df[df['assignment1_submission'] > '2015-12-31']

In [76]:
early.mean()


Out[76]:
assignment1_grade    74.972741
assignment2_grade    67.252190
assignment3_grade    61.129050
assignment4_grade    54.157620
assignment5_grade    48.634643
assignment6_grade    43.838980
dtype: float64

In [70]:
late.mean()


Out[70]:
assignment1_grade    74.017429
assignment2_grade    66.370822
assignment3_grade    60.023244
assignment4_grade    54.058138
assignment5_grade    48.599402
assignment6_grade    43.844384
dtype: float64

In [71]:
from scipy import stats
stats.ttest_ind?

In [72]:
stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])


Out[72]:
Ttest_indResult(statistic=1.400549944897566, pvalue=0.16148283016060577)

In [73]:
stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])


Out[73]:
Ttest_indResult(statistic=1.3239868220912567, pvalue=0.18563824610067967)

In [74]:
stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])


Out[74]:
Ttest_indResult(statistic=1.7116160037010733, pvalue=0.087101516341556676)